package com.suty.craw.core.impl;

import java.io.*;
import java.util.regex.*;

import com.suty.craw.core.Analyser;

/**
 * {@link Analyser} implementation that scans a downloaded page for anchor
 * elements whose {@code href} starts with {@code http://} and prints each
 * anchor, together with its title text, to standard output.
 */
public class AnalyserImpl extends Analyser {

	/**
	 * Charset names tried, in order, when decoding the page bytes. The first
	 * one the running JVM supports is used; the page content itself is not
	 * inspected to pick an encoding.
	 */
	private static final String[] CANDIDATE_ENCODINGS = {"GBK", "gb2312", "UTF-8"};

	/**
	 * Matches a complete anchor element whose href starts with http://.
	 * The tag name is written as [aA]; a class such as [a|A] would also
	 * accept a literal '|' because '|' is not alternation inside brackets.
	 */
	private static final Pattern LINK_PATTERN =
			Pattern.compile("<[aA] href=\"http://.*?\">.*?</[aA]>");

	/** Matches from the first '>' of an anchor up to its closing tag. */
	private static final Pattern TITLE_PATTERN = Pattern.compile(">.*?</[aA]>");

	/** Markup stripped from a matched title: closing anchor, font tags, stray '>'. */
	private static final Pattern TITLE_NOISE_PATTERN =
			Pattern.compile("</a>|</A>|<font .*?>|<FONT .*?>|</font>|</FONT>|>");

	/**
	 * Decodes the page, finds every {@code http://} anchor in it and prints a
	 * "Title: ..." line for the anchor text followed by a "Link: ..." line
	 * holding the full anchor markup and an empty line.
	 *
	 * @param page raw bytes of the downloaded page; must not be {@code null}
	 * @return always {@code false}; what the flag means to callers is defined
	 *         by {@link Analyser} and is not visible from this class
	 */
	@Override
	public boolean parsePage(ByteArrayOutputStream page) {
		String strPage = decode(page);

		Matcher linkMatcher = LINK_PATTERN.matcher(strPage);
		while (linkMatcher.find()) {
			String link = linkMatcher.group();

			Matcher titleMatcher = TITLE_PATTERN.matcher(link);
			while (titleMatcher.find()) {
				String title = TITLE_NOISE_PATTERN.matcher(titleMatcher.group()).replaceAll("");
				System.out.println("Title: " + title);
			}
			System.out.println("Link: " + link);
			System.out.println("");
		}

		return false;
	}

	/**
	 * Converts the page bytes to text using the first supported candidate
	 * charset, falling back to the platform default charset if none of the
	 * candidates is available.
	 */
	private static String decode(ByteArrayOutputStream page) {
		for (String encoding : CANDIDATE_ENCODINGS) {
			try {
				return page.toString(encoding);
			} catch (UnsupportedEncodingException ignored) {
				// This JVM does not provide the charset; try the next candidate.
			}
		}
		// No candidate charset was available: decode with the platform default
		// instead of silently treating the page as empty.
		return page.toString();
	}

}
